R Package Dependencies - Installability


In [152]:
import pandas
import deps
import itertools
from collections import OrderedDict

%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')

# Workaround: re-import the local `deps` module so that edits made to deps.py
# are picked up when "Run All" is hit without restarting the kernel.
# NOTE(review): the bare `reload` used here is the Python 2 builtin.
deps = reload(deps)

In [153]:
# Load the package dataset (presumably one row per package record across the
# GitHub, CRAN and Bioconductor sources -- confirm against the CSV schema).
# `DataFrame.from_csv` is deprecated and was removed in pandas 1.0; `read_csv`
# is its documented replacement.  `parse_dates=True` reproduces the default
# that `from_csv` forwarded, and `index_col=None` keeps the plain integer index.
data = pandas.read_csv('../data/github-cran-bioc-alldata.csv', index_col=None, parse_dates=True)
# Package sources considered throughout this notebook.
sources = ['github', 'cran', 'bioc']

# Every non-empty subset of `sources` (each as a list), ordered by subset size
# and, within a size, in the order produced by itertools.combinations.
combinations = [
    list(subset)
    for size in range(1, len(sources) + 1)
    for subset in itertools.combinations(sources, size)
]

Let us create the graphs for several dates.


In [154]:
# Snapshot dates: month-ends taken six months apart within the studied window.
snapshot_dates = pandas.date_range(start='2013-09', end='2015-01', freq='6M')

# Maps each snapshot date to the graph built by deps.create_graph_for,
# in chronological (insertion) order.
graphs = OrderedDict()
for date in snapshot_dates:
    graphs[date] = deps.create_graph_for(data, date)

Let us compute which packages are installable.


In [155]:
# installability[date][from_source][label] -> result of deps.installable
#   (presumably the packages of `from_source` installable with the labelled
#   set of sources -- confirm in deps.py).
# packages[date][from_source] -> graph keys for which deps.available is true.
installability = OrderedDict()
packages = OrderedDict()

# (label, allowed sources) pairs.  The 'None' entry stands for "no source
# allowed"; its empty list is created at call time, one fresh list per call.
scenarios = [('None', None)] + [(' '.join(combo), combo) for combo in combinations]

for date, graph in graphs.iteritems():
    # Dates are keyed as 'Y-M-D' strings (not zero-padded).
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    installable_by_source = {}
    packages_by_source = {}
    for from_source in sources:
        packages_by_source[from_source] = [
            p for p in graph.iterkeys() if deps.available(graph, p, [from_source])
        ]
        installable_by_source[from_source] = {
            label: deps.installable(graph, from_source, [] if allowed is None else allowed)
            for label, allowed in scenarios
        }
    installability[date] = installable_by_source
    packages[date] = packages_by_source

Evolution of the number of packages


In [156]:
# Number of available packages per date (rows) and per source (columns).
package_counts = {
    date: {source: len(names) for source, names in by_source.iteritems()}
    for date, by_source in packages.iteritems()
}
df_packages = pandas.DataFrame.from_dict(package_counts, orient='index')

# Turn the 'Y-M-D' string keys into real timestamps, then order the rows
# chronologically and the columns as listed in `sources`.
df_packages.index = pandas.to_datetime(df_packages.index)
df_packages = df_packages.sort_index()[sources]

ax = df_packages.plot(
    title=u'Number of available packages',
    style=[None, None, None, '--', '--', '--', ':'],
    figsize=(15,6),
)
ax.legend(ncol=2, loc='upper left')


Out[156]:
<matplotlib.legend.Legend at 0x7f348a73f290>

In [157]:


In [158]:
# number[date] counts, per snapshot, the packages having a 'github' entry,
# a 'cran' entry, and both at once.
number = OrderedDict()
for date, graph in graphs.iteritems():
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    tally = {'github': 0, 'cran': 0, 'both': 0}
    for name, package in graph.iteritems():
        # Keys of `package` are the sources under which it is recorded.
        origins = list(package.iterkeys())
        on_github = 'github' in origins
        on_cran = 'cran' in origins

        if on_github:
            tally['github'] += 1
        if on_cran:
            tally['cran'] += 1
        if on_github and on_cran:
            tally['both'] += 1
    number[date] = tally
            
# One row per snapshot date, ordered chronologically.
df_N = pandas.DataFrame.from_dict(number, orient='index')
df_N.index = pandas.to_datetime(df_N.index)
df_N = df_N.sort_index()

# Percentage of each source's packages that are present on both GitHub and CRAN.
# Columns are added one at a time so 'githubP' stays ahead of 'cranP'.
shared = df_N['both']
for pct_column, total_column in [('githubP', 'github'), ('cranP', 'cran')]:
    df_N[pct_column] = 100. * shared / df_N[total_column]

# Left axis: absolute number of packages on GitHub, on CRAN, and on both.
ax = df_N[['github', 'cran', 'both']].plot(title=u'Number of available packages\n',
                                         figsize=(8,4), ylim=(0,8000))
# Raw string: '\c' is not a valid escape sequence, so the non-raw literal only
# worked by accident and triggers an invalid-escape warning on recent Pythons.
# The resulting label text is unchanged.
ax.legend(['github (left)', 'cran (left)', r'github $\cap$ cran (left)'], ncol=1, loc='best')

# Right axis: the same overlap expressed as a percentage of each source,
# drawn dashed on a twin axis sharing the x (date) axis.
ax2 = ax.twinx()
ax2 = df_N[['githubP', 'cranP']].plot(ax=ax2, ylim=(0,100), style=['--', '--'], legend=False, grid=False)

# Show the right-hand ticks as whole percentages.
ax2.set_yticklabels([str(int(v))+'%' for v in ax2.get_yticks()])
df_N


Out[158]:
both cran github githubP cranP
2013-09-30 509 4852 1817 28.013209 10.490519
2014-03-31 713 5489 2898 24.603175 12.989616
2014-09-30 1004 6215 4609 21.783467 16.154465

Evolution of the installability


In [159]:
# df_installability[source]: rows are snapshot dates, columns are the labelled
# source sets, values are the sizes of the corresponding installability results.
df_installability = {}

# Column order used for plotting: 'None' first, then every source combination.
column_order = [' '.join(comb) for comb in [['None']] + combinations]

for source in sources:
    counts = {
        date: {label: len(found) for label, found in by_source[source].iteritems()}
        for date, by_source in installability.iteritems()
    }
    frame = pandas.DataFrame.from_dict(counts, orient='index')
    frame.index = pandas.to_datetime(frame.index)
    # The stored frame keeps its construction order; only the plotted copy
    # below is sorted and column-ordered.
    df_installability[source] = frame

    df = frame.sort_index()[column_order]
    ax = df.plot(
        title=u'Number of installable packages from {} using given set of sources'.format(source),
        style=['k:', 'b', 'g', 'r', 'b--', 'g--', 'r--', 'r:'],
        figsize=(15, 6),
    )
    ax.legend(ncol=2, loc='best')



In [160]:
# Focus on Github

df = df_installability['github']
df.index = pandas.to_datetime(df.index)
# Total number of GitHub packages per date, aligned on the date index.
# Note: this adds the column to the frame stored in df_installability.
df['number'] = df_N['github']

df = df.sort_index()[['number'] + [' '.join(comb) for comb in [['None']] + combinations]]

# The plot below uses a 0-100 y-range and '%' tick labels, so the plotted
# columns must be percentages of the GitHub package count.  Previously the
# normalisation was commented out and raw counts were plotted against that
# range.  The percentages live in a separate frame so that the table
# displayed at the end of the cell still shows raw counts.
plotted = ['None', 'github', 'cran', 'github cran']
df_percent = 100.0 * df[plotted].div(df['number'], axis=0)

ax = df_percent.plot(title=u'Installable packages from GitHub\n',
             style=['k--', 'b', 'g', 'r--'],
             ylim=(0,100),
             figsize=(8,4))
# Raw string: '\c' is not a valid escape sequence; the label text is unchanged.
ax.legend(['None', 'github', 'cran', r'github $\cup$ cran'], ncol=1, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])

df


Out[160]:
number None github cran bioc github cran github bioc cran bioc github cran bioc
2013-09-30 1817 497 503 511 498 513 504 512 514
2014-03-31 2898 789 800 824 790 826 801 825 827
2014-09-30 4609 1310 1339 1366 1312 1371 1341 1368 1373